Extract Features for Training


In [1]:
# Connect to the IPython parallel cluster.
# NOTE(review): creating the first Client presumably also registers the
# %px / %%px parallel magics used by the next cell -- confirm against the
# installed IPython version.
from IPython.parallel import Client
c = Client()

In [2]:
%%px --local
import numpy as np
import pandas as pd
from kobra.dr import Labels
from kobra.tr_utils import prep_out_path, time_now_str
import os
from os import path
import shutil
import mahotas as mh
import mahotas.labeled as mhl
import cv2
from kobra.dr import ImageReader
import time

# CSV read into `names`; get_areal_features only looks labels up in it when
# isTest is False.
labels_file =  '/kaggle/retina/trainLabels.csv'
# Directory whose entries are listed as input files and handed to ImageReader.
root = '/kaggle/retina/reduced/test'

# Passed to ImageReader as its third argument.
# NOTE(review): presumably holds one mask per image in `root` -- confirm.
masks_dir = '/kaggle/retina/test/prepmasks'
# Output directory for the feature table.
features_path = '/kaggle/retina/reduced/features/test'
# Base name of the CSV written into features_path ("<prefix>.csv").
prefix = 'features'

def get_predicted_region(im, marker):
    """Return a copy of `im` in which only pixels equal to `marker` survive.

    Every element that differs from `marker` is set to 0; `im` itself is
    left unmodified.
    """
    region = im.copy()
    # Zero out everything that does not carry the requested label value.
    np.putmask(region, region != marker, 0)
    return region

# Label table; get_areal_features matches its 'image' column against the
# file name without extension (only when isTest is False).
names = pd.read_csv(labels_file)
# Number of histogram bins per lesion type; the feature vector holds
# 2 * n_bins counts.
n_bins = 100
# True: no label lookup, the image name and a placeholder level of -1 are
# appended to the features instead.
isTest = True

In [3]:
# NOTE(review): prep_out_path comes from kobra.tr_utils; presumably it
# creates (or resets) the output directory -- confirm before pointing
# features_path at a directory that holds results worth keeping.
prep_out_path(features_path)
# Every entry of `root`, unfiltered and in the arbitrary order os.listdir
# returns them.
files = os.listdir(root)

In [4]:
def get_areal_features(f):
    """Build the region-size histogram feature vector for one image file.

    The image is split into its drusen and haemorrhage regions (pixels equal
    to ``Labels.Drusen`` / ``Labels.Haemorage``), each region map is broken
    into connected components, and the component sizes -- expressed as a
    fraction of the non-zero mask area -- are binned into ``n_bins`` bins
    over the range (0, 1e-3).

    Reads the globals ``isTest``, ``names``, ``root``, ``masks_dir`` and
    ``n_bins`` and calls ``get_predicted_region``.
    NOTE(review): when this function is sent to cluster engines, those names
    presumably have to exist in the engines' namespace as well -- confirm.

    Parameters
    ----------
    f : str
        File name of the image inside ``root`` (with extension).

    Returns
    -------
    numpy.ndarray
        1-D array: ``n_bins`` drusen counts, ``n_bins`` haemorrhage counts,
        then two trailing entries. In test mode these are the file name
        without extension and ``-1``, and NumPy coerces the whole array to a
        string dtype. Otherwise they are the values of the first row of
        ``names`` whose ``'image'`` matches the file name.

    Raises
    ------
    IndexError
        In training mode, when no row of ``names`` matches the file.
    """
    image_id = path.splitext(f)[0]
    if not isTest:
        label = names.loc[names['image'] == image_id]
    else:
        label = image_id
    imr = ImageReader(root, f, masks_dir, gray_scale = True)

    drusen = get_predicted_region(imr.image, Labels.Drusen)
    blood = get_predicted_region(imr.image, Labels.Haemorage)

    # 5x5 structuring element handed to the connected-component labelling.
    Bc = np.ones((5, 5))
    labels_drusen, _ = mh.label(drusen, Bc)
    labels_blood, _ = mh.label(blood, Bc)

    # Normalise by the non-zero mask area so sizes are comparable across images.
    area = float(cv2.countNonZero(imr.mask))

    # sizes excluding background
    sizes_drusen = mhl.labeled_size(labels_drusen)[1:] / area
    sizes_blood = mhl.labeled_size(labels_blood)[1:] / area

    # Components larger than 0.1% of the mask area fall outside the range
    # and are not counted by np.histogram.
    hist_drusen, _ = np.histogram(sizes_drusen, n_bins, (0, 1e-3))
    hist_blood, _ = np.histogram(sizes_blood, n_bins, (0, 1e-3))

    if not isTest:
        tail = label.values[0]
    else:
        tail = [label, -1]

    # Single concatenation; no seed array needed (the old one relied on the
    # removed ``np.int`` alias).
    return np.r_[hist_drusen, hist_blood, tail]

In [5]:
# Smoke test: run the extractor locally on a single image before
# distributing the work.
f = '1_left.png'
get_areal_features(f)


Out[5]:
array(['40', '31', '20', '13', '4', '6', '4', '1', '2', '0', '0', '0', '1',
       '2', '0', '0', '2', '2', '0', '1', '0', '1', '1', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '2', '0', '0', '0', '1', '0',
       '0', '1', '1', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '32', '1', '0', '0',
       '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '1_left', '-1'], 
      dtype='|S11')

In [6]:
# Spread the per-file feature extraction over the cluster engines.
dv = Client().load_balanced_view()

features = dv.map(get_areal_features, np.array(files))
# Block until every task has finished before collecting the results.
features.wait()
# One column per histogram bin (drusen bins first, then haemorrhage bins),
# followed by the two trailing entries of each feature vector.
# list(...) keeps this working on Python 3, where range + list is a TypeError.
df = pd.DataFrame(data = features[:],
                  columns = list(range(n_bins * 2)) + ['name', 'level'])

df.to_csv(path.join(features_path, prefix + ".csv"), index = False, header=True)